In [1]:
import pandas as pd
import os

# Directory containing the yearly 311 service-request CSV exports.
directory = "data"

# Collect each year's DataFrame, then concatenate once at the end
# (list-append + single concat avoids quadratic concat-in-a-loop).
all_dataframes = []

# Load SR2018.csv .. SR2025.csv (range end is exclusive, hence 2026).
for year in range(2018, 2026):
    filename = f"SR{year}.csv"
    file_path = os.path.join(directory, filename)

    if os.path.exists(file_path):
        try:
            # Try UTF-8 first; some exports (e.g. SR2025.csv) are Latin-1 encoded.
            try:
                df = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip')
            except UnicodeDecodeError:
                print(f"UTF-8 failed for {filename}. Trying Latin-1 encoding...")
                df = pd.read_csv(file_path, encoding='latin1', on_bad_lines='skip')
            # Tag each row with its source year so provenance survives the concat.
            df['Year'] = year
            all_dataframes.append(df)
            print(f"Successfully loaded {filename}")
        except Exception as e:
            print(f"Error loading {filename}: {e}")
    else:
        print(f"File {filename} not found")

# Concatenate all years into a single frame and persist it.
if all_dataframes:
    combined_df = pd.concat(all_dataframes, ignore_index=True)
    output_file = os.path.join(directory, "SR_all_years.csv")
    combined_df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Combined CSV saved as {output_file}")
    # Preview only when something was loaded; otherwise combined_df is undefined
    # and the old unconditional .head() call raised a NameError.
    print("\nPreview of combined data:")
    print(combined_df.head())
else:
    print("No files were loaded. Check file paths and names.")
Successfully loaded SR2018.csv
Successfully loaded SR2019.csv
Successfully loaded SR2020.csv
Successfully loaded SR2021.csv
Successfully loaded SR2022.csv
Successfully loaded SR2023.csv
Successfully loaded SR2024.csv
UTF-8 failed for SR2025.csv. Trying Latin-1 encoding...
Successfully loaded SR2025.csv
Combined CSV saved as data/SR_all_years.csv
Preview of combined data:
Creation Date Status First 3 Chars of Postal Code \
0 2018-01-01 00:06:06.0000000 Closed M1N
1 2018-01-01 00:14:04.0000000 Closed M2M
2 2018-01-01 00:17:19.0000000 Closed M8V
3 2018-01-01 00:24:35.0000000 Closed M1K
4 2018-01-01 00:28:44.0000000 In-progress M4L
Intersection Street 1 Intersection Street 2 Ward \
0 NaN NaN Scarborough Southwest (20)
1 NaN NaN Willowdale (18)
2 NaN NaN Etobicoke-Lakeshore (03)
3 NaN NaN Scarborough Centre (21)
4 NaN NaN Beaches-East York (19)
Service Request Type Division \
0 Noise Municipal Licensing & Standards
1 Noise Municipal Licensing & Standards
2 INJUR/DIST WILDLIFE Municipal Licensing & Standards
3 Watermain-Possible Break Toronto Water
4 Water Service Line-No Water Toronto Water
Section Year
0 District Enforcement 2018
1 District Enforcement 2018
2 Toronto Animal Services 2018
3 District Ops 2018
4 District Ops 2018
In [2]:
# Reload the combined CSV from disk and take a first look at it.
combined_df = pd.read_csv("data/SR_all_years.csv")

# Structure, dtypes, memory footprint.
print("Combined DataFrame Info:")
print(combined_df.info())

# How many requests each year contributed.
print("\nRows per year:")
year_counts = combined_df['Year'].value_counts().sort_index()
print(year_counts)

# Null counts, column by column.
print("\nMissing values per column:")
null_counts = combined_df.isnull().sum()
print(null_counts)
Combined DataFrame Info: <class 'pandas.core.frame.DataFrame'> RangeIndex: 2771056 entries, 0 to 2771055 Data columns (total 10 columns): # Column Dtype --- ------ ----- 0 Creation Date object 1 Status object 2 First 3 Chars of Postal Code object 3 Intersection Street 1 object 4 Intersection Street 2 object 5 Ward object 6 Service Request Type object 7 Division object 8 Section object 9 Year int64 dtypes: int64(1), object(9) memory usage: 211.4+ MB None Rows per year: Year 2018 412095 2019 438914 2020 347748 2021 323880 2022 427903 2023 385975 2024 402824 2025 31717 Name: count, dtype: int64 Missing values per column: Creation Date 0 Status 0 First 3 Chars of Postal Code 0 Intersection Street 1 2383434 Intersection Street 2 2385286 Ward 0 Service Request Type 0 Division 0 Section 0 Year 0 dtype: int64
In [3]:
print (combined_df['Ward'].value_counts().sort_index())
Ward Beaches-East York (19) 150380 Davenport (09) 146529 Don Valley East (16) 63918 Don Valley North (17) 77239 Don Valley West (15) 112122 Eglinton-Lawrence (08) 142718 Etobicoke Centre (02) 122626 Etobicoke North (01) 86647 Etobicoke-Lakeshore (03) 158161 Humber River-Black Creek (07) 75289 Parkdale-High Park (04) 141563 Scarborough Centre (21) 105139 Scarborough North (23) 66251 Scarborough Southwest (20) 129584 Scarborough-Agincourt (22) 67949 Scarborough-Guildwood (24) 85788 Scarborough-Rouge Park (25) 105995 Spadina-Fort York (10) 106685 Toronto Centre (13) 93312 Toronto-Danforth (14) 162181 Toronto-St. Paul's (12) 123903 University-Rosedale (11) 147825 Unknown 25 Willowdale (18) 83609 York Centre (06) 96189 York South-Weston (05) 119429 Name: count, dtype: int64
In [4]:
# Cleaning the data
# Parse Creation Date; unparseable values become NaT instead of raising.
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'], errors='coerce')

# Derive calendar features from the parsed timestamps.
creation = combined_df['Creation Date'].dt
combined_df['Month'] = creation.month
combined_df['DayOfWeek'] = creation.dayofweek  # Monday = 0 ... Sunday = 6
combined_df['Hour'] = creation.hour
# Create Season feature
def get_season(month: int) -> str:
    """Map a calendar month number (1-12) to its season name.

    Dec/Jan/Feb -> 'Winter', Mar-May -> 'Spring', Jun-Aug -> 'Summer',
    everything else (Sep-Nov) -> 'Fall'.
    """
    months_by_season = {
        'Winter': (12, 1, 2),
        'Spring': (3, 4, 5),
        'Summer': (6, 7, 8),
    }
    for season_name, months in months_by_season.items():
        if month in months:
            return season_name
    return 'Fall'
combined_df['Season'] = combined_df['Month'].apply(get_season)

# The intersection-street columns are mostly null (see the earlier null
# counts), so drop them rather than impute.
combined_df = combined_df.drop(columns=['Intersection Street 1', 'Intersection Street 2'])

# How many timestamps failed to parse (NaT after the coerce above)?
print("Rows with invalid Creation Date:", combined_df['Creation Date'].isna().sum())

print("\nCleaned DataFrame Info:")
print(combined_df.info())
print("\nPreview of cleaned data:")
print(combined_df.head())

# Persist the cleaned frame for downstream use.
cleaned_file = "data/SR_all_years_cleaned.csv"
combined_df.to_csv(cleaned_file, index=False, encoding='utf-8')
print(f"Cleaned data saved as {cleaned_file}")
Rows with invalid Creation Date: 0
Cleaned DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2771056 entries, 0 to 2771055
Data columns (total 12 columns):
# Column Dtype
--- ------ -----
0 Creation Date datetime64[ns]
1 Status object
2 First 3 Chars of Postal Code object
3 Ward object
4 Service Request Type object
5 Division object
6 Section object
7 Year int64
8 Month int32
9 DayOfWeek int32
10 Hour int32
11 Season object
dtypes: datetime64[ns](1), int32(3), int64(1), object(7)
memory usage: 222.0+ MB
None
Preview of cleaned data:
Creation Date Status First 3 Chars of Postal Code \
0 2018-01-01 00:06:06 Closed M1N
1 2018-01-01 00:14:04 Closed M2M
2 2018-01-01 00:17:19 Closed M8V
3 2018-01-01 00:24:35 Closed M1K
4 2018-01-01 00:28:44 In-progress M4L
Ward Service Request Type \
0 Scarborough Southwest (20) Noise
1 Willowdale (18) Noise
2 Etobicoke-Lakeshore (03) INJUR/DIST WILDLIFE
3 Scarborough Centre (21) Watermain-Possible Break
4 Beaches-East York (19) Water Service Line-No Water
Division Section Year Month \
0 Municipal Licensing & Standards District Enforcement 2018 1
1 Municipal Licensing & Standards District Enforcement 2018 1
2 Municipal Licensing & Standards Toronto Animal Services 2018 1
3 Toronto Water District Ops 2018 1
4 Toronto Water District Ops 2018 1
DayOfWeek Hour Season
0 0 0 Winter
1 0 0 Winter
2 0 0 Winter
3 0 0 Winter
4 0 0 Winter
Cleaned data saved as data/SR_all_years_cleaned.csv
In [5]:
# Exploratory Data Analysis (EDA)
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Request Volume by Year
plt.figure(figsize=(10, 6))
combined_df['Year'].value_counts().sort_index().plot(kind='bar', color='skyblue')
plt.title('Service Request Volume by Year (2018-2025)')
plt.xlabel('Year')
plt.ylabel('Number of Requests')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 2. Request Volume by Season
# Assign hue= and legend=False: seaborn >= 0.13 deprecates passing palette
# without hue (this silences the FutureWarning with identical output).
plt.figure(figsize=(10, 6))
sns.countplot(data=combined_df, x='Season', hue='Season',
              order=['Winter', 'Spring', 'Summer', 'Fall'],
              palette='viridis', legend=False)
plt.title('Service Request Volume by Season')
plt.xlabel('Season')
plt.ylabel('Number of Requests')
plt.tight_layout()
plt.show()

# 3. Top 10 Service Request Types
plt.figure(figsize=(12, 6))
top_types = combined_df['Service Request Type'].value_counts().head(10)
sns.barplot(x=top_types.values, y=top_types.index, hue=top_types.index,
            palette='muted', legend=False)
plt.title('Top 10 Service Request Types (2018-2025)')
plt.xlabel('Number of Requests')
plt.ylabel('Service Request Type')
plt.tight_layout()
plt.show()

# 4. Summary Statistics
print("\nSummary Statistics:")
print(combined_df.describe(include='all'))
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/577754450.py:17: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.countplot(data=combined_df, x='Season', order=['Winter', 'Spring', 'Summer', 'Fall'], palette='viridis')
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/577754450.py:27: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=top_types.values, y=top_types.index, palette='muted')
Summary Statistics:
Creation Date Status First 3 Chars of Postal Code \
count 2771056 2771056 2771056
unique NaN 9 101
top NaN Closed Intersection
freq NaN 1274923 387622
mean 2021-07-04 19:42:56.926410752 NaN NaN
min 2018-01-01 00:06:06 NaN NaN
25% 2019-08-07 11:38:18.249999872 NaN NaN
50% 2021-07-06 11:27:45.500000 NaN NaN
75% 2023-04-24 12:59:02 NaN NaN
max 2025-01-31 23:56:11 NaN NaN
std NaN NaN NaN
Ward Service Request Type \
count 2771056 2771056
unique 26 852
top Toronto-Danforth (14) Residential: Bin: Repair or Replace Lid
freq 162181 112751
mean NaN NaN
min NaN NaN
25% NaN NaN
50% NaN NaN
75% NaN NaN
max NaN NaN
std NaN NaN
Division Section Year \
count 2771056 2771056 2.771056e+06
unique 9 30 NaN
top Solid Waste Management Services Collections NaN
freq 1028897 1023003 NaN
mean NaN NaN 2.021026e+03
min NaN NaN 2.018000e+03
25% NaN NaN 2.019000e+03
50% NaN NaN 2.021000e+03
75% NaN NaN 2.023000e+03
max NaN NaN 2.025000e+03
std NaN NaN 2.073598e+00
Month DayOfWeek Hour Season
count 2.771056e+06 2.771056e+06 2.771056e+06 2771056
unique NaN NaN NaN 4
top NaN NaN NaN Summer
freq NaN NaN NaN 811628
mean 6.289808e+00 2.566666e+00 1.321489e+01 NaN
min 1.000000e+00 0.000000e+00 0.000000e+00 NaN
25% 4.000000e+00 1.000000e+00 1.000000e+01 NaN
50% 6.000000e+00 2.000000e+00 1.300000e+01 NaN
75% 9.000000e+00 4.000000e+00 1.600000e+01 NaN
max 1.200000e+01 6.000000e+00 2.300000e+01 NaN
std 3.292783e+00 1.772963e+00 4.314874e+00 NaN
In [6]:
# Histograms for Request Patterns by Year and Season

# 1. Request volume per year as a histogram (one bin per year).
plt.figure(figsize=(10, 6))
plt.hist(combined_df['Year'], bins=range(2018, 2027), edgecolor='black', color='lightblue')
plt.title('Histogram of Service Request Volume by Year (2018-2025)')
plt.xlabel('Year')
plt.ylabel('Number of Requests')
plt.xticks(range(2018, 2026))
plt.tight_layout()
plt.show()

# 2. Overlaid histograms of the five most common request types, one layer per season.
top_requests = combined_df['Service Request Type'].value_counts().head(5).index
season_request_df = combined_df[combined_df['Service Request Type'].isin(top_requests)]

plt.figure(figsize=(12, 8))
for season_name in ('Winter', 'Spring', 'Summer', 'Fall'):
    season_subset = season_request_df.loc[season_request_df['Season'] == season_name]
    plt.hist(season_subset['Service Request Type'], bins=5, alpha=0.5,
             label=season_name, edgecolor='black')
plt.title('Histogram of Top 5 Service Request Types by Season')
plt.xlabel('Service Request Type')
plt.ylabel('Number of Requests')
plt.legend(title='Season')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 3. Text summary: top 5 request types within each season.
print("\nTop 5 Service Request Types by Season:")
for season_name in ('Winter', 'Spring', 'Summer', 'Fall'):
    top_in_season = (combined_df.loc[combined_df['Season'] == season_name, 'Service Request Type']
                     .value_counts()
                     .head(5))
    print(f"\n{season_name}:")
    print(top_in_season)
Top 5 Service Request Types by Season: Winter: Service Request Type Road - Pot hole 23941 Residential: Bin: Repair or Replace Lid 21916 Publication Request - Solid Waste 20652 Property Standards 18785 Res / Garbage / Not Picked Up 15227 Name: count, dtype: int64 Spring: Service Request Type Road - Pot hole 35207 Residential: Bin: Repair or Replace Lid 28035 Property Standards 21335 Residential Furniture / Not Picked Up 13681 Residential: Bin: Repair or Replace Body/Handle 13573 Name: count, dtype: int64 Summer: Service Request Type Residential: Bin: Repair or Replace Lid 33857 Property Standards 29675 General Pruning 27111 Long Grass and Weeds 22751 CADAVER WILDLIFE 18964 Name: count, dtype: int64 Fall: Service Request Type Residential: Bin: Repair or Replace Lid 28943 Property Standards 24650 CADAVER WILDLIFE 16031 Residential Furniture / Not Picked Up 14644 Cadaver - Wildlife 14287 Name: count, dtype: int64
In [7]:
# Heatmap of Top Request Types by Season
# pd.crosstab counts row/column co-occurrences by default, so no values=/
# aggfunc= is needed. (The previous call passed a *scalar* via values= with
# aggfunc='count', which is invalid crosstab usage — values must be
# array-like when aggfunc is given. The default frequency count is the
# intended result.)
pivot_table = (
    pd.crosstab(combined_df['Season'], combined_df['Service Request Type'])
    .loc[['Winter', 'Spring', 'Summer', 'Fall'], top_requests]
)

plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, annot=True, fmt='.0f', cmap='YlGnBu', cbar_kws={'label': 'Number of Requests'})
plt.title('Heatmap of Top 5 Service Request Types by Season')
plt.xlabel('Service Request Type')
plt.ylabel('Season')
plt.tight_layout()
plt.show()

# Summary of the top 5 request types within each season.
print("\nTop 5 Service Request Types by Season:")
for season in ['Winter', 'Spring', 'Summer', 'Fall']:
    season_counts = (combined_df[combined_df['Season'] == season]['Service Request Type']
                     .value_counts().head(5))
    print(f"\n{season}:")
    print(season_counts)
Top 5 Service Request Types by Season: Winter: Service Request Type Road - Pot hole 23941 Residential: Bin: Repair or Replace Lid 21916 Publication Request - Solid Waste 20652 Property Standards 18785 Res / Garbage / Not Picked Up 15227 Name: count, dtype: int64 Spring: Service Request Type Road - Pot hole 35207 Residential: Bin: Repair or Replace Lid 28035 Property Standards 21335 Residential Furniture / Not Picked Up 13681 Residential: Bin: Repair or Replace Body/Handle 13573 Name: count, dtype: int64 Summer: Service Request Type Residential: Bin: Repair or Replace Lid 33857 Property Standards 29675 General Pruning 27111 Long Grass and Weeds 22751 CADAVER WILDLIFE 18964 Name: count, dtype: int64 Fall: Service Request Type Residential: Bin: Repair or Replace Lid 28943 Property Standards 24650 CADAVER WILDLIFE 16031 Residential Furniture / Not Picked Up 14644 Cadaver - Wildlife 14287 Name: count, dtype: int64
In [8]:
# Historical Data by Location (Ward)

# 1. Total requests per ward across all years.
ward_counts = combined_df['Ward'].value_counts()
print("\nTotal Service Requests by Ward (2018-2025):")
print(ward_counts)

# 2. Ward x Year request counts rendered as an annotated heatmap.
ward_year_pivot = pd.crosstab(combined_df['Ward'], combined_df['Year'])
plt.figure(figsize=(12, 10))
sns.heatmap(ward_year_pivot, annot=True, fmt='.0f', cmap='YlGnBu',
            cbar_kws={'label': 'Number of Requests'})
plt.title('Heatmap of Service Request Volume by Ward and Year (2018-2025)')
plt.xlabel('Year')
plt.ylabel('Ward')
plt.tight_layout()
plt.show()

# 3. Most common request types within the five busiest wards.
print("\nTop 5 Service Request Types by Ward:")
for busy_ward in ward_counts.index[:5]:
    type_counts = (combined_df.loc[combined_df['Ward'] == busy_ward, 'Service Request Type']
                   .value_counts()
                   .head(5))
    print(f"\n{busy_ward}:")
    print(type_counts)
Total Service Requests by Ward (2018-2025): Ward Toronto-Danforth (14) 162181 Etobicoke-Lakeshore (03) 158161 Beaches-East York (19) 150380 University-Rosedale (11) 147825 Davenport (09) 146529 Eglinton-Lawrence (08) 142718 Parkdale-High Park (04) 141563 Scarborough Southwest (20) 129584 Toronto-St. Paul's (12) 123903 Etobicoke Centre (02) 122626 York South-Weston (05) 119429 Don Valley West (15) 112122 Spadina-Fort York (10) 106685 Scarborough-Rouge Park (25) 105995 Scarborough Centre (21) 105139 York Centre (06) 96189 Toronto Centre (13) 93312 Etobicoke North (01) 86647 Scarborough-Guildwood (24) 85788 Willowdale (18) 83609 Don Valley North (17) 77239 Humber River-Black Creek (07) 75289 Scarborough-Agincourt (22) 67949 Scarborough North (23) 66251 Don Valley East (16) 63918 Unknown 25 Name: count, dtype: int64
Top 5 Service Request Types by Ward: Toronto-Danforth (14): Service Request Type Residential: Bin: Repair or Replace Lid 4933 Property Standards 4883 Road - Pot hole 4406 Sewer Service Line-Blocked 4288 Res / Garbage / Not Picked Up 4244 Name: count, dtype: int64 Etobicoke-Lakeshore (03): Service Request Type Residential: Bin: Repair or Replace Lid 4872 Property Standards 4758 Res / Garbage / Not Picked Up 4666 Road - Pot hole 4315 Res / Recycle / Not Picked Up 4258 Name: count, dtype: int64 Beaches-East York (19): Service Request Type Property Standards 5634 Residential: Bin: Repair or Replace Lid 4575 Res / Garbage / Not Picked Up 4525 Injured - Wildlife 4157 Sewer Service Line-Blocked 3836 Name: count, dtype: int64 University-Rosedale (11): Service Request Type Residential: Bin: Repair or Replace Lid 6395 Property Standards 4801 Road - Pot hole 4128 Residential: Bin: Repair or Replace Body/Handle 3090 Residential: Garbage Bin: Missing 3042 Name: count, dtype: int64 Davenport (09): Service Request Type Residential: Bin: Repair or Replace Lid 8876 Property Standards 5094 Residential: Bin: Repair or Replace Body/Handle 4711 Residential: Recycle Bin: Missing 3332 Residential: Garbage Bin: Missing 3332 Name: count, dtype: int64
In [9]:
# Final Modeling with Season Focus
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, f1_score
import matplotlib.pyplot as plt
import numpy as np

# Train only on complete years; 2025 is partial (January only per the earlier
# per-year counts) and would bias monthly volumes downward.
model_df = combined_df[combined_df['Year'] != 2025].copy()

# ------------------------------------------------------------------
# 1. Volume prediction: requests per (Year, Month, Season, Ward) cell
# ------------------------------------------------------------------
volume_df = model_df.groupby(['Year', 'Month', 'Season', 'Ward']).size().reset_index(name='Request_Count')

features_vol = ['Year', 'Month', 'Season', 'Ward']
# Integer-encode the categorical features for the tree model.
for col in ['Season', 'Ward']:
    volume_df[col] = LabelEncoder().fit_transform(volume_df[col])

X_vol = volume_df[features_vol]
y_vol = volume_df['Request_Count']
# log1p compresses the heavy right tail of the count distribution.
y_vol_log = np.log1p(y_vol)

X_train_vol, X_test_vol, y_train_vol, y_test_vol = train_test_split(
    X_vol, y_vol_log, test_size=0.3, random_state=42)

rf_reg_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf_reg_model.fit(X_train_vol, y_train_vol)
y_pred_rf = rf_reg_model.predict(X_test_vol)

# Evaluate on the original count scale (invert the log1p transform).
y_pred_rf_exp = np.expm1(y_pred_rf)
y_test_vol_exp = np.expm1(y_test_vol)
print("\nRandom Forest Regressor (Volume Prediction by Season/Ward/Month):")
print(f"R² Score: {r2_score(y_test_vol_exp, y_pred_rf_exp):.4f}")
print(f"Mean Squared Error: {mean_squared_error(y_test_vol_exp, y_pred_rf_exp):.4f}")

# Predicted-vs-actual scatter with the y = x reference line.
# (A stray no-op `plt.title` attribute access was removed here.)
plt.figure(figsize=(10, 6))
plt.scatter(y_test_vol_exp, y_pred_rf_exp, alpha=0.5, color='purple')
plt.plot([y_test_vol_exp.min(), y_test_vol_exp.max()],
         [y_test_vol_exp.min(), y_test_vol_exp.max()], 'r--')
plt.title('Predicted vs Actual Request Volumes (Random Forest)')
plt.xlabel('Actual Request Count')
plt.ylabel('Predicted Request Count')
plt.tight_layout()
plt.show()

# ------------------------------------------------------------------
# 2. Status prediction: will a request end up Closed?
# ------------------------------------------------------------------
features_stat = ['Season', 'Service Request Type', 'Division', 'Ward']
for col in features_stat:
    model_df[col] = LabelEncoder().fit_transform(model_df[col])

# Binary target: 1 = Closed, 0 = any other status.
model_df['Status'] = model_df['Status'].apply(lambda x: 1 if x == 'Closed' else 0)
print("\nClass Distribution (Status):")
print(model_df['Status'].value_counts(normalize=True))

X_stat = model_df[features_stat]
y_stat = model_df['Status']
X_train_stat, X_test_stat, y_train_stat, y_test_stat = train_test_split(
    X_stat, y_stat, test_size=0.3, random_state=42)

# class_weight='balanced' compensates for the (mild) class imbalance above.
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight='balanced',
                                  random_state=42, n_jobs=-1)
rf_model.fit(X_train_stat, y_train_stat)
y_pred_stat = rf_model.predict(X_test_stat)

print("\nRandom Forest (Status Prediction by Season):")
print(f"Accuracy: {accuracy_score(y_test_stat, y_pred_stat):.4f}")
print(f"F1 Score: {f1_score(y_test_stat, y_pred_stat):.4f}")

# Which features drive the classifier?
feat_importance = pd.Series(rf_model.feature_importances_, index=features_stat).sort_values(ascending=False)
print("\nFeature Importance (Random Forest):")
print(feat_importance)
Random Forest Regressor (Volume Prediction by Season/Ward/Month): R² Score: 0.7490 Mean Squared Error: 53492.1904
Class Distribution (Status): Status 0 0.534693 1 0.465307 Name: proportion, dtype: float64 Random Forest (Status Prediction by Season): Accuracy: 0.6470 F1 Score: 0.6557 Feature Importance (Random Forest): Service Request Type 0.695665 Division 0.231902 Ward 0.049178 Season 0.023256 dtype: float64
In [10]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter search for the volume regressor (5-fold cross-validation).
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
}

grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid, cv=5)
grid_search.fit(X_train_vol, y_train_vol)
print("Best parameters:", grid_search.best_params_)
Best parameters: {'max_depth': 15, 'min_samples_split': 2, 'n_estimators': 200}
In [11]:
!pip install folium geopy pandas
Requirement already satisfied: folium in /opt/anaconda3/lib/python3.12/site-packages (0.19.5) Requirement already satisfied: geopy in /opt/anaconda3/lib/python3.12/site-packages (2.4.1) Requirement already satisfied: pandas in /opt/anaconda3/lib/python3.12/site-packages (2.2.2) Requirement already satisfied: branca>=0.6.0 in /opt/anaconda3/lib/python3.12/site-packages (from folium) (0.8.1) Requirement already satisfied: jinja2>=2.9 in /opt/anaconda3/lib/python3.12/site-packages (from folium) (3.1.4) Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.12/site-packages (from folium) (1.26.4) Requirement already satisfied: requests in /opt/anaconda3/lib/python3.12/site-packages (from folium) (2.32.3) Requirement already satisfied: xyzservices in /opt/anaconda3/lib/python3.12/site-packages (from folium) (2022.9.0) Requirement already satisfied: geographiclib<3,>=1.52 in /opt/anaconda3/lib/python3.12/site-packages (from geopy) (2.0) Requirement already satisfied: python-dateutil>=2.8.2 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2024.1) Requirement already satisfied: tzdata>=2022.7 in /opt/anaconda3/lib/python3.12/site-packages (from pandas) (2023.3) Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.12/site-packages (from jinja2>=2.9->folium) (2.1.3) Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0) Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.12/site-packages (from requests->folium) (3.3.2) Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.12/site-packages (from requests->folium) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.12/site-packages (from requests->folium) (2.2.3) Requirement already satisfied: 
certifi>=2017.4.17 in /opt/anaconda3/lib/python3.12/site-packages (from requests->folium) (2024.8.30)
In [12]:
!pip install geopandas
Requirement already satisfied: geopandas in /opt/anaconda3/lib/python3.12/site-packages (1.0.1) Requirement already satisfied: numpy>=1.22 in /opt/anaconda3/lib/python3.12/site-packages (from geopandas) (1.26.4) Requirement already satisfied: pyogrio>=0.7.2 in /opt/anaconda3/lib/python3.12/site-packages (from geopandas) (0.10.0) Requirement already satisfied: packaging in /opt/anaconda3/lib/python3.12/site-packages (from geopandas) (24.1) Requirement already satisfied: pandas>=1.4.0 in /opt/anaconda3/lib/python3.12/site-packages (from geopandas) (2.2.2) Requirement already satisfied: pyproj>=3.3.0 in /opt/anaconda3/lib/python3.12/site-packages (from geopandas) (3.7.1) Requirement already satisfied: shapely>=2.0.0 in /opt/anaconda3/lib/python3.12/site-packages (from geopandas) (2.0.7) Requirement already satisfied: python-dateutil>=2.8.2 in /opt/anaconda3/lib/python3.12/site-packages (from pandas>=1.4.0->geopandas) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.12/site-packages (from pandas>=1.4.0->geopandas) (2024.1) Requirement already satisfied: tzdata>=2022.7 in /opt/anaconda3/lib/python3.12/site-packages (from pandas>=1.4.0->geopandas) (2023.3) Requirement already satisfied: certifi in /opt/anaconda3/lib/python3.12/site-packages (from pyogrio>=0.7.2->geopandas) (2024.8.30) Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.12/site-packages (from python-dateutil>=2.8.2->pandas>=1.4.0->geopandas) (1.16.0)
In [13]:
import os

# Sanity-check where the kernel is running and what the data folder holds.
print("Current working directory:", os.getcwd())

data_folder = 'data'
if not os.path.exists(data_folder):
    print("The 'data' folder does not exist in the current working directory.")
else:
    print("Contents of the 'data' folder:", os.listdir(data_folder))
Current working directory: /Users/priyankmali/Final Project EECS 1516 Contents of the 'data' folder: ['25-ward-model', '25-ward-model-december-2018-wgs84-latitude-longitude.zip', 'SR_all_years_cleaned.csv', 'SR_train_with_weather.csv', '.ipynb_checkpoints', 'SR2021.csv', 'SR2020.csv', 'SR2022.csv', 'SR2023.csv', 'SR_all_years.csv', 'SR2018.csv', 'SR2024.csv', 'SR2025.csv', 'SR2019.csv']
In [14]:
import zipfile

# Peek inside the ward-boundary ZIP without extracting anything.
zip_path = 'data/25-ward-model-december-2018-wgs84-latitude-longitude.zip'
with zipfile.ZipFile(zip_path, 'r') as archive:
    print("Contents of the ZIP file:", archive.namelist())
Contents of the ZIP file: ['WARD_WGS84.cpg', 'WARD_WGS84.dbf', 'WARD_WGS84.prj', 'WARD_WGS84.sbn', 'WARD_WGS84.sbx', 'WARD_WGS84.shp', 'WARD_WGS84.shp.xml', 'WARD_WGS84.shx', 'WARD_WGS84_readme.txt']
In [15]:
import folium
import pandas as pd
import geopandas as gpd
from folium.plugins import HeatMap
import zipfile
import os
from IPython.display import display # Import display for inline rendering
# Toronto ward coordinates (latitude, longitude) - keeping this for reference
# NOTE(review): this dict is not referenced by the map-building code below
# (ward labels are placed using the shapefile's LATITUDE/LONGITUDE columns);
# it is kept only as a manual reference table of approximate ward centroids.
toronto_ward_coords = {
    'Toronto-Danforth (14)': [43.6785, -79.3526],
    'Etobicoke-Lakeshore (03)': [43.6216, -79.4996],
    'Beaches-East York (19)': [43.6938, -79.2989],
    'University-Rosedale (11)': [43.6659, -79.4009],
    'Davenport (09)': [43.6690, -79.4611],
    'Eglinton-Lawrence (08)': [43.7110, -79.4350],
    'Parkdale-High Park (04)': [43.6452, -79.4496],
    'Scarborough Southwest (20)': [43.7299, -79.2569],
    'Toronto-St. Paul\'s (12)': [43.6895, -79.4083],
    'Etobicoke Centre (02)': [43.6488, -79.5362],
    'Don Valley East (16)': [43.7412, -79.3473],
    'Don Valley North (17)': [43.7931, -79.3624],
    'Don Valley West (15)': [43.7126, -79.3742],
    'Etobicoke North (01)': [43.7224, -79.5678],
    'Humber River-Black Creek (07)': [43.7589, -79.5136],
    'Scarborough Centre (21)': [43.7766, -79.2579],
    'Scarborough North (23)': [43.8345, -79.2626],
    'Scarborough-Agincourt (22)': [43.7864, -79.2917],
    'Scarborough-Guildwood (24)': [43.7631, -79.1887],
    'Scarborough-Rouge Park (25)': [43.8026, -79.1872],
    'Spadina-Fort York (10)': [43.6396, -79.4029],
    'Toronto Centre (13)': [43.6532, -79.3832],
    'Willowdale (18)': [43.7744, -79.4056],
    'York Centre (06)': [43.7462, -79.4563],
    'York South-Weston (05)': [43.6948, -79.5026]
}
# 1. Verify the 'data' folder and the ward-boundary ZIP file exist.
data_folder = 'data'
zip_path = 'data/25-ward-model-december-2018-wgs84-latitude-longitude.zip'

print("Current working directory:", os.getcwd())
if os.path.exists(data_folder):
    print("Contents of the 'data' folder:", os.listdir(data_folder))
else:
    print("The 'data' folder does not exist. Creating it now...")
    os.makedirs(data_folder)
    print("Please move the ZIP file to the 'data' folder and rerun the code.")
    raise FileNotFoundError("The 'data' folder was created, but the ZIP file is missing.")

if not os.path.exists(zip_path):
    raise FileNotFoundError(f"The ZIP file {zip_path} does not exist. Please check the file name and path.")

# 2. Extract the boundary files into their own subdirectory.
extract_path = 'data/25-ward-model'
if not os.path.exists(extract_path):
    os.makedirs(extract_path)

try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        print("Contents of the ZIP file:", zip_ref.namelist())
        zip_ref.extractall(extract_path)
        print("ZIP file extracted successfully.")
except Exception as e:
    print(f"Error extracting ZIP file: {e}")
    raise

# Confirm what landed on disk.
extracted_files = os.listdir(extract_path)
print("Extracted files:", extracted_files)
# 3. Load the ward boundary shapefile from the extracted files.
shp_file = next((f for f in extracted_files if f.endswith('.shp')), None)
if not shp_file:
    raise FileNotFoundError("No .shp file found in the extracted files.")
shp_path = os.path.join(extract_path, shp_file)
print(f"Loading Shapefile: {shp_path}")

wards_gdf = gpd.read_file(shp_path)

# Inspect schema and a sample to identify the ward-name field.
print("Columns in the Shapefile:")
print(wards_gdf.columns)
print("\nFirst few rows of the Shapefile:")
print(wards_gdf.head())

# AREA_NAME holds the ward name (confirmed from the inspection above).
ward_name_field = 'AREA_NAME'
if ward_name_field not in wards_gdf.columns:
    raise ValueError(f"Ward name field '{ward_name_field}' not found in the Shapefile. Available columns: {wards_gdf.columns}")

print(f"\nFirst few ward names (using {ward_name_field}):")
print(wards_gdf[ward_name_field].head())
print(f"\nNumber of wards in the Shapefile: {len(wards_gdf)}")
# 4./5. Build per-ward request counts and normalise the names so they match
# the shapefile's AREA_NAME values.
ward_counts = combined_df['Ward'].value_counts()
ward_counts_df = ward_counts.reset_index()
ward_counts_df.columns = ['Ward', 'Request_Count']
# Strip the trailing " (NN)" ward number, e.g. "Davenport (09)" -> "Davenport".
ward_counts_df['Ward'] = ward_counts_df['Ward'].str.replace(r'\s*\(\d+\)', '', regex=True)

# Debug: list both sides of the upcoming join to spot name mismatches.
print("\nWard names in Shapefile:")
print(wards_gdf[ward_name_field].tolist())
print("\nWard names in ward_counts_df (after cleaning):")
print(ward_counts_df['Ward'].tolist())

# 6. Attach counts to the geometries (left join keeps every ward polygon).
wards_gdf = wards_gdf.merge(ward_counts_df, left_on=ward_name_field, right_on='Ward', how='left')

# Any ward left without a count means its name failed to match.
unmatched_wards = wards_gdf.loc[wards_gdf['Request_Count'].isna(), ward_name_field].tolist()
if unmatched_wards:
    print("\nWarning: The following wards in the Shapefile did not match any wards in ward_counts_df:")
    print(unmatched_wards)
else:
    print("\nAll wards matched successfully.")
# 7. Base map centred on Toronto.
m = folium.Map(location=[43.70, -79.42], zoom_start=11)

# 8. Choropleth shading wards by request volume (black = no matched data).
folium.Choropleth(
    geo_data=wards_gdf,
    name='311 Service Requests',
    data=ward_counts_df,
    columns=['Ward', 'Request_Count'],
    key_on=f'feature.properties.{ward_name_field}',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='311 Service Requests',
    nan_fill_color='black',
    nan_fill_opacity=0.3
).add_to(m)

# 9. Transparent GeoJson overlay that carries the hover tooltip
# (ward name + request count).
folium.GeoJson(
    wards_gdf,
    style_function=lambda feature: {
        'fillColor': 'transparent',
        'color': 'black',
        'weight': 0.5,
        'fillOpacity': 0
    },
    tooltip=folium.GeoJsonTooltip(
        fields=['Ward', 'Request_Count'],
        aliases=['Ward:', 'Requests:'],
        localize=True
    )
).add_to(m)

# 10. Text label at each ward's centroid, using the shapefile's
# LATITUDE/LONGITUDE attribute columns.
for _, ward_row in wards_gdf.iterrows():
    label_html = (
        '<div style="font-size: 10pt; color: black; text-align: center; '
        f'white-space: nowrap;">{ward_row[ward_name_field]}</div>'
    )
    folium.Marker(
        location=[ward_row['LATITUDE'], ward_row['LONGITUDE']],
        icon=folium.features.DivIcon(
            icon_size=(150, 36),
            icon_anchor=(75, 18),
            html=label_html
        )
    ).add_to(m)

# 11. Layer switcher, then 12. render the map inline.
folium.LayerControl().add_to(m)
print("Displaying the map with ward name labels below:")
display(m)
Current working directory: /Users/priyankmali/Final Project EECS 1516
Contents of the 'data' folder: ['25-ward-model', '25-ward-model-december-2018-wgs84-latitude-longitude.zip', 'SR_all_years_cleaned.csv', 'SR_train_with_weather.csv', '.ipynb_checkpoints', 'SR2021.csv', 'SR2020.csv', 'SR2022.csv', 'SR2023.csv', 'SR_all_years.csv', 'SR2018.csv', 'SR2024.csv', 'SR2025.csv', 'SR2019.csv']
Contents of the ZIP file: ['WARD_WGS84.cpg', 'WARD_WGS84.dbf', 'WARD_WGS84.prj', 'WARD_WGS84.sbn', 'WARD_WGS84.sbx', 'WARD_WGS84.shp', 'WARD_WGS84.shp.xml', 'WARD_WGS84.shx', 'WARD_WGS84_readme.txt']
ZIP file extracted successfully.
Extracted files: ['WARD_WGS84.sbx', 'WARD_WGS84.shx', 'WARD_WGS84.cpg', 'WARD_WGS84.shp', 'WARD_WGS84.dbf', 'WARD_WGS84.shp.xml', 'WARD_WGS84.sbn', 'WARD_WGS84_readme.txt', 'WARD_WGS84.prj']
Loading Shapefile: data/25-ward-model/WARD_WGS84.shp
Columns in the Shapefile:
Index(['AREA_ID', 'AREA_TYPE', 'AREA_S_CD', 'AREA_L_CD', 'AREA_NAME', 'X', 'Y',
'LONGITUDE', 'LATITUDE', 'geometry'],
dtype='object')
First few rows of the Shapefile:
AREA_ID AREA_TYPE AREA_S_CD AREA_L_CD AREA_NAME X \
0 2551040 WD18 16 16 Don Valley East 318237.290
1 2551044 WD18 03 03 Etobicoke-Lakeshore 303099.474
2 2551048 WD18 15 15 Don Valley West 314825.876
3 2551052 WD18 23 23 Scarborough North 324522.149
4 2551056 WD18 11 11 University-Rosedale 313306.543
Y LONGITUDE LATITUDE \
0 4844000.0 -79.33298 43.739716
1 4831000.0 -79.52087 43.621646
2 4843000.0 -79.37536 43.728396
3 4852000.0 -79.25467 43.809672
4 4837000.0 -79.39432 43.671139
geometry
0 POLYGON ((-79.31335 43.71699, -79.3195 43.7156...
1 POLYGON ((-79.49777 43.65198, -79.49725 43.651...
2 POLYGON ((-79.35232 43.71573, -79.35209 43.715...
3 POLYGON ((-79.22591 43.8396, -79.22556 43.8394...
4 POLYGON ((-79.39004 43.6905, -79.39004 43.6905...
First few ward names (using AREA_NAME):
0 Don Valley East
1 Etobicoke-Lakeshore
2 Don Valley West
3 Scarborough North
4 University-Rosedale
Name: AREA_NAME, dtype: object
Number of wards in the Shapefile: 25
Ward names in Shapefile:
['Don Valley East', 'Etobicoke-Lakeshore', 'Don Valley West', 'Scarborough North', 'University-Rosedale', 'Spadina-Fort York', 'Davenport', 'York South-Weston', 'Don Valley North', 'Eglinton-Lawrence', 'Etobicoke Centre', 'Etobicoke North', 'Parkdale-High Park', "Toronto-St. Paul's", 'Scarborough-Agincourt', 'Scarborough Centre', 'Scarborough-Guildwood', 'Scarborough-Rouge Park', 'Scarborough Southwest', 'Beaches-East York', 'Willowdale', 'York Centre', 'Humber River-Black Creek', 'Toronto-Danforth', 'Toronto Centre']
Ward names in ward_counts_df (after cleaning):
['Toronto-Danforth', 'Etobicoke-Lakeshore', 'Beaches-East York', 'University-Rosedale', 'Davenport', 'Eglinton-Lawrence', 'Parkdale-High Park', 'Scarborough Southwest', "Toronto-St. Paul's", 'Etobicoke Centre', 'York South-Weston', 'Don Valley West', 'Spadina-Fort York', 'Scarborough-Rouge Park', 'Scarborough Centre', 'York Centre', 'Toronto Centre', 'Etobicoke North', 'Scarborough-Guildwood', 'Willowdale', 'Don Valley North', 'Humber River-Black Creek', 'Scarborough-Agincourt', 'Scarborough North', 'Don Valley East', 'Unknown']
All wards matched successfully.
Displaying the map with ward name labels below:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [16]:
# Seasonal analysis cell: extracts the ward shapefile, builds a Season column
# if missing, plots a season-by-request-type heatmap, then renders one folium
# choropleth layer per season. Depends on `combined_df` from earlier cells.
import folium
import pandas as pd
import geopandas as gpd
from folium.plugins import HeatMap
import zipfile
import os
from IPython.display import display # Import display for inline rendering
import matplotlib.pyplot as plt
import seaborn as sns
# Toronto ward coordinates (latitude, longitude) - keeping this for reference
# NOTE(review): this dict is not used below (centroids come from the shapefile's
# LATITUDE/LONGITUDE columns) — kept only as documentation of approximate centers.
toronto_ward_coords = {
    'Toronto-Danforth (14)': [43.6785, -79.3526],
    'Etobicoke-Lakeshore (03)': [43.6216, -79.4996],
    'Beaches-East York (19)': [43.6938, -79.2989],
    'University-Rosedale (11)': [43.6659, -79.4009],
    'Davenport (09)': [43.6690, -79.4611],
    'Eglinton-Lawrence (08)': [43.7110, -79.4350],
    'Parkdale-High Park (04)': [43.6452, -79.4496],
    'Scarborough Southwest (20)': [43.7299, -79.2569],
    'Toronto-St. Paul\'s (12)': [43.6895, -79.4083],
    'Etobicoke Centre (02)': [43.6488, -79.5362],
    'Don Valley East (16)': [43.7412, -79.3473],
    'Don Valley North (17)': [43.7931, -79.3624],
    'Don Valley West (15)': [43.7126, -79.3742],
    'Etobicoke North (01)': [43.7224, -79.5678],
    'Humber River-Black Creek (07)': [43.7589, -79.5136],
    'Scarborough Centre (21)': [43.7766, -79.2579],
    'Scarborough North (23)': [43.8345, -79.2626],
    'Scarborough-Agincourt (22)': [43.7864, -79.2917],
    'Scarborough-Guildwood (24)': [43.7631, -79.1887],
    'Scarborough-Rouge Park (25)': [43.8026, -79.1872],
    'Spadina-Fort York (10)': [43.6396, -79.4029],
    'Toronto Centre (13)': [43.6532, -79.3832],
    'Willowdale (18)': [43.7744, -79.4056],
    'York Centre (06)': [43.7462, -79.4563],
    'York South-Weston (05)': [43.6948, -79.5026]
}
# 1. Verify the 'data' folder and file
data_folder = 'data'
zip_path = 'data/25-ward-model-december-2018-wgs84-latitude-longitude.zip'
# Print the current working directory and contents of the 'data' folder
print("Current working directory:", os.getcwd())
if os.path.exists(data_folder):
    print("Contents of the 'data' folder:", os.listdir(data_folder))
else:
    print("The 'data' folder does not exist. Creating it now...")
    os.makedirs(data_folder)
    print("Please move the ZIP file to the 'data' folder and rerun the code.")
    raise FileNotFoundError("The 'data' folder was created, but the ZIP file is missing.")
# Check if the ZIP file exists
if not os.path.exists(zip_path):
    raise FileNotFoundError(f"The ZIP file {zip_path} does not exist. Please check the file name and path.")
# 2. Extract the ZIP file
extract_path = 'data/25-ward-model'
# Create the extraction directory if it doesn’t exist
if not os.path.exists(extract_path):
    os.makedirs(extract_path)
# Extract the ZIP file (re-extracts on every run; harmless but not cached)
try:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        print("Contents of the ZIP file:", zip_ref.namelist())
        zip_ref.extractall(extract_path)
    print("ZIP file extracted successfully.")
except Exception as e:
    print(f"Error extracting ZIP file: {e}")
    raise
# List the extracted files to confirm
extracted_files = os.listdir(extract_path)
print("Extracted files:", extracted_files)
# 3. Load ward boundary data (Shapefile)
# Look for a .shp file in the extracted files
shp_file = next((f for f in extracted_files if f.endswith('.shp')), None)
if not shp_file:
    raise FileNotFoundError("No .shp file found in the extracted files.")
shp_path = os.path.join(extract_path, shp_file)
print(f"Loading Shapefile: {shp_path}")
wards_gdf = gpd.read_file(shp_path)
# Inspect the Shapefile
print("Columns in the Shapefile:")
print(wards_gdf.columns)
# Print the first few rows to identify the ward name field
print("\nFirst few rows of the Shapefile:")
print(wards_gdf.head())
# Specify the ward name field manually
ward_name_field = 'AREA_NAME' # Confirmed from your output
# Verify the ward name field exists
if ward_name_field not in wards_gdf.columns:
    raise ValueError(f"Ward name field '{ward_name_field}' not found in the Shapefile. Available columns: {wards_gdf.columns}")
print(f"\nFirst few ward names (using {ward_name_field}):")
print(wards_gdf[ward_name_field].head())
print(f"\nNumber of wards in the Shapefile: {len(wards_gdf)}")
# 4. Inspect combined_df to confirm columns
print("\nColumns in combined_df:")
print(combined_df.columns)
# 5. Check the existing 'Season' column
print("\nUnique values in the 'Season' column:")
print(combined_df['Season'].unique())
# Verify the 'Season' column has the expected values; rebuild it from the
# creation month only when it does not.
expected_seasons = {'Winter', 'Spring', 'Summer', 'Fall'}
if not set(combined_df['Season'].unique()).issubset(expected_seasons):
    print("Recreating 'Season' column as it does not contain expected values...")
    date_column = 'Creation Date' # Corrected column name
    if date_column not in combined_df.columns:
        raise KeyError(f"Date column '{date_column}' not found in combined_df. Available columns: {combined_df.columns}")
    # Ensure the date column is in datetime format
    combined_df[date_column] = pd.to_datetime(combined_df[date_column])
    def get_season(month):
        # Map a calendar month (1-12) to a meteorological season name.
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else: # months 9, 10, 11
            return 'Fall'
    # Add Season column
    combined_df['Season'] = combined_df[date_column].dt.month.apply(get_season)
    print("Updated unique values in the 'Season' column:")
    print(combined_df['Season'].unique())
# 6. Identify the top 5 service request types overall
request_type_column = 'Service Request Type' # Confirmed from your output
if request_type_column not in combined_df.columns:
    raise KeyError(f"Request type column '{request_type_column}' not found in combined_df. Available columns: {combined_df.columns}")
top_requests = combined_df[request_type_column].value_counts().head(5).index.tolist()
print("\nTop 5 Service Request Types Overall:")
print(top_requests)
# 7. Create a pivot table for the heatmap.
# NOTE(review): crosstab counts by default — the values=/aggfunc='count'
# arguments are redundant here but do not change the result.
pivot_table = pd.crosstab(combined_df['Season'],
                          combined_df[request_type_column],
                          values=combined_df[request_type_column],
                          aggfunc='count').loc[['Winter', 'Spring', 'Summer', 'Fall'], top_requests]
# 8. Plot the heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(pivot_table, annot=True, fmt='.0f', cmap='YlGnBu', cbar_kws={'label': 'Number of Requests'})
plt.title('Heatmap of Top 5 Service Request Types by Season')
plt.xlabel('Service Request Type')
plt.ylabel('Season')
plt.tight_layout()
plt.show()
# 9. Summary of top requests by season
print("\nTop 5 Service Request Types by Season:")
for season in ['Winter', 'Spring', 'Summer', 'Fall']:
    season_counts = (combined_df[combined_df['Season'] == season][request_type_column]
                     .value_counts().head(5))
    print(f"\n{season}:")
    print(season_counts)
# 10. Calculate service requests by season and ward
seasonal_counts = combined_df.groupby(['Season', 'Ward']).size().reset_index(name='Request_Count')
# Pivot the data to have a column for each season
seasonal_pivot = seasonal_counts.pivot(index='Ward', columns='Season', values='Request_Count').fillna(0).reset_index()
# Clean ward names in seasonal_pivot to match the Shapefile (strip " (NN)")
seasonal_pivot['Ward'] = seasonal_pivot['Ward'].str.replace(r'\s*\(\d+\)', '', regex=True)
# 11. Merge the seasonal data with the Shapefile
wards_gdf_seasonal = wards_gdf.merge(seasonal_pivot, left_on=ward_name_field, right_on='Ward', how='left')
# Debug: Check for unmatched wards after the merge.
# NOTE(review): only the 'Winter' column is checked as a proxy for a failed
# merge — an all-NaN row would have NaN in every season column, so this works,
# but it assumes Winter exists for every matched ward.
unmatched_wards = wards_gdf_seasonal[wards_gdf_seasonal['Winter'].isna()][ward_name_field].tolist()
if unmatched_wards:
    print("\nWarning: The following wards in the Shapefile did not match any wards in seasonal_pivot:")
    print(unmatched_wards)
else:
    print("\nAll wards matched successfully for seasonal data.")
# 12. Create map centered on Toronto
toronto_center = [43.70, -79.42]
m = folium.Map(location=toronto_center, zoom_start=11)
# 13. Create a choropleth layer for each season.
# NOTE(review): each Choropleth adds its own color-scale legend, so four
# legends stack on the rendered map — confirm that is acceptable.
seasons = ['Winter', 'Spring', 'Summer', 'Fall']
season_colors = {
    'Winter': 'Blues',
    'Spring': 'Greens',
    'Summer': 'Oranges',
    'Fall': 'Reds'
}
for season in seasons:
    folium.Choropleth(
        geo_data=wards_gdf_seasonal,
        name=f'311 Requests - {season}',
        data=wards_gdf_seasonal,
        columns=['Ward', season],
        key_on=f'feature.properties.{ward_name_field}',
        fill_color=season_colors[season], # Use a different color for each season
        fill_opacity=0.7,
        line_opacity=0.2,
        legend_name=f'311 Service Requests ({season})',
        nan_fill_color='black',
        nan_fill_opacity=0.3
    ).add_to(m)
# 14. Add tooltips to show ward name and request count for each season
# (one transparent overlay per season so the layer control can toggle them)
for season in seasons:
    folium.GeoJson(
        wards_gdf_seasonal,
        style_function=lambda feature: {
            'fillColor': 'transparent',
            'color': 'black',
            'weight': 0.5,
            'fillOpacity': 0
        },
        tooltip=folium.GeoJsonTooltip(
            fields=['Ward', season],
            aliases=['Ward:', f'Requests ({season}):'],
            localize=True
        ),
        name=f'Tooltips - {season}' # Name the layer for the layer control
    ).add_to(m)
# 15. Add ward name labels at the centroid of each ward
for _, row in wards_gdf.iterrows():
    centroid = [row['LATITUDE'], row['LONGITUDE']]
    ward_name = row[ward_name_field]
    folium.Marker(
        location=centroid,
        icon=folium.features.DivIcon(
            icon_size=(150, 36),
            icon_anchor=(75, 18),
            html=f'<div style="font-size: 10pt; color: black; text-align: center; white-space: nowrap;">{ward_name}</div>'
        )
    ).add_to(m)
# 16. Add layer control
folium.LayerControl().add_to(m)
# 17. Display the map inline
print("Displaying the map with seasonal layers below:")
display(m)
Current working directory: /Users/priyankmali/Final Project EECS 1516
Contents of the 'data' folder: ['25-ward-model', '25-ward-model-december-2018-wgs84-latitude-longitude.zip', 'SR_all_years_cleaned.csv', 'SR_train_with_weather.csv', '.ipynb_checkpoints', 'SR2021.csv', 'SR2020.csv', 'SR2022.csv', 'SR2023.csv', 'SR_all_years.csv', 'SR2018.csv', 'SR2024.csv', 'SR2025.csv', 'SR2019.csv']
Contents of the ZIP file: ['WARD_WGS84.cpg', 'WARD_WGS84.dbf', 'WARD_WGS84.prj', 'WARD_WGS84.sbn', 'WARD_WGS84.sbx', 'WARD_WGS84.shp', 'WARD_WGS84.shp.xml', 'WARD_WGS84.shx', 'WARD_WGS84_readme.txt']
ZIP file extracted successfully.
Extracted files: ['WARD_WGS84.sbx', 'WARD_WGS84.shx', 'WARD_WGS84.cpg', 'WARD_WGS84.shp', 'WARD_WGS84.dbf', 'WARD_WGS84.shp.xml', 'WARD_WGS84.sbn', 'WARD_WGS84_readme.txt', 'WARD_WGS84.prj']
Loading Shapefile: data/25-ward-model/WARD_WGS84.shp
Columns in the Shapefile:
Index(['AREA_ID', 'AREA_TYPE', 'AREA_S_CD', 'AREA_L_CD', 'AREA_NAME', 'X', 'Y',
'LONGITUDE', 'LATITUDE', 'geometry'],
dtype='object')
First few rows of the Shapefile:
AREA_ID AREA_TYPE AREA_S_CD AREA_L_CD AREA_NAME X \
0 2551040 WD18 16 16 Don Valley East 318237.290
1 2551044 WD18 03 03 Etobicoke-Lakeshore 303099.474
2 2551048 WD18 15 15 Don Valley West 314825.876
3 2551052 WD18 23 23 Scarborough North 324522.149
4 2551056 WD18 11 11 University-Rosedale 313306.543
Y LONGITUDE LATITUDE \
0 4844000.0 -79.33298 43.739716
1 4831000.0 -79.52087 43.621646
2 4843000.0 -79.37536 43.728396
3 4852000.0 -79.25467 43.809672
4 4837000.0 -79.39432 43.671139
geometry
0 POLYGON ((-79.31335 43.71699, -79.3195 43.7156...
1 POLYGON ((-79.49777 43.65198, -79.49725 43.651...
2 POLYGON ((-79.35232 43.71573, -79.35209 43.715...
3 POLYGON ((-79.22591 43.8396, -79.22556 43.8394...
4 POLYGON ((-79.39004 43.6905, -79.39004 43.6905...
First few ward names (using AREA_NAME):
0 Don Valley East
1 Etobicoke-Lakeshore
2 Don Valley West
3 Scarborough North
4 University-Rosedale
Name: AREA_NAME, dtype: object
Number of wards in the Shapefile: 25
Columns in combined_df:
Index(['Creation Date', 'Status', 'First 3 Chars of Postal Code', 'Ward',
'Service Request Type', 'Division', 'Section', 'Year', 'Month',
'DayOfWeek', 'Hour', 'Season'],
dtype='object')
Unique values in the 'Season' column:
['Winter' 'Spring' 'Summer' 'Fall']
Top 5 Service Request Types Overall:
['Residential: Bin: Repair or Replace Lid', 'Property Standards', 'Road - Pot hole', 'Res / Garbage / Not Picked Up', 'Residential Furniture / Not Picked Up']
Top 5 Service Request Types by Season: Winter: Service Request Type Road - Pot hole 23941 Residential: Bin: Repair or Replace Lid 21916 Publication Request - Solid Waste 20652 Property Standards 18785 Res / Garbage / Not Picked Up 15227 Name: count, dtype: int64 Spring: Service Request Type Road - Pot hole 35207 Residential: Bin: Repair or Replace Lid 28035 Property Standards 21335 Residential Furniture / Not Picked Up 13681 Residential: Bin: Repair or Replace Body/Handle 13573 Name: count, dtype: int64 Summer: Service Request Type Residential: Bin: Repair or Replace Lid 33857 Property Standards 29675 General Pruning 27111 Long Grass and Weeds 22751 CADAVER WILDLIFE 18964 Name: count, dtype: int64 Fall: Service Request Type Residential: Bin: Repair or Replace Lid 28943 Property Standards 24650 CADAVER WILDLIFE 16031 Residential Furniture / Not Picked Up 14644 Cadaver - Wildlife 14287 Name: count, dtype: int64 All wards matched successfully for seasonal data. Displaying the map with seasonal layers below:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [17]:
import pandas as pd
# Ensure 'Creation Date' is in datetime format
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'])
# Filter combined_df to the period from 2018-01-01 to 2025-03-31 (both inclusive).
start_date = pd.to_datetime('2018-01-01')
end_date = pd.to_datetime('2025-03-31')
# BUG FIX: the previous `<= end_date` compared full timestamps against
# midnight 2025-03-31 00:00:00, silently dropping every request created
# later that day. An exclusive bound at the *following* midnight keeps the
# whole end day.
combined_df = combined_df[
    (combined_df['Creation Date'] >= start_date)
    & (combined_df['Creation Date'] < end_date + pd.Timedelta(days=1))
]
print("Date range of combined_df after filtering:")
print(f"Min date: {combined_df['Creation Date'].min()}")
print(f"Max date: {combined_df['Creation Date'].max()}")
Date range of combined_df after filtering: Min date: 2018-01-01 00:06:06 Max date: 2025-01-31 23:56:11
In [18]:
import numpy as np
# Create a date range from 2018-01-01 to 2025-03-26.
# NOTE(review): the request data is filtered to 2025-03-31 elsewhere, so the
# last few days of March 2025 get no weather rows — confirm the mismatch is OK.
dates = pd.date_range(start='2018-01-01', end='2025-03-26', freq='D')
# Simulate daily weather data for Toronto (synthetic placeholders, NOT real
# observations — replace with Environment Canada data for real analysis).
np.random.seed(42) # For reproducibility
weather_data = {
    'Date': dates,
    'Mean_Temperature_C': np.random.normal(loc=10, scale=10, size=len(dates)), # Placeholder
    'Total_Rainfall_mm': np.random.exponential(scale=2, size=len(dates)), # Placeholder
    'Total_Snowfall_cm': np.zeros(len(dates)) # Placeholder
}
# Convert to DataFrame
weather_df = pd.DataFrame(weather_data)
weather_df['Month'] = weather_df['Date'].dt.month
# Adjust temperature based on monthly averages
# Jan: -2.3°C, Jul: 24.8°C, with seasonal variation
# (each assignment overwrites the placeholder for every day of that month
# across ALL years with a fresh normal draw)
for month in range(1, 13):
    if month in [1, 2, 12]: # Winter months
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=-2.3, scale=3, size=len(weather_df[weather_df['Month'] == month]))
    elif month in [6, 7, 8]: # Summer months
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=24.8, scale=3, size=len(weather_df[weather_df['Month'] == month]))
    elif month in [3, 4, 5]: # Spring months
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=12, scale=5, size=len(weather_df[weather_df['Month'] == month]))
    else: # Fall months (9, 10, 11)
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=15, scale=5, size=len(weather_df[weather_df['Month'] == month]))
# Adjust rainfall: February (19mm/month), July (53mm/month)
# Distribute monthly rainfall across days
monthly_rainfall = {1: 54, 2: 19, 3: 80, 4: 80, 5: 48, 6: 81, 7: 53, 8: 75, 9: 78, 10: 78, 11: 75, 12: 54} # From web ID: 6, 19
for month in range(1, 13):
    # days_in_month here counts that month's days across ALL years in the range
    days_in_month = weather_df[weather_df['Month'] == month].shape[0]
    monthly_total = monthly_rainfall[month]
    daily_avg = monthly_total / (days_in_month / 2) # Assume rain on half the days
    rain_days = np.random.choice([0, 1], size=days_in_month, p=[0.5, 0.5]) # 50% chance of rain
    weather_df.loc[weather_df['Month'] == month, 'Total_Rainfall_mm'] = rain_days * np.random.exponential(scale=daily_avg, size=days_in_month)
# Adjust snowfall: November to April, with January peak (57mm/month)
# NOTE(review): comments quote snowfall in mm but the column is Total_Snowfall_cm —
# units look inconsistent, TODO confirm which is intended.
# Snowfall occurs from November to April
for month in range(1, 13):
    days_in_month = weather_df[weather_df['Month'] == month].shape[0]
    if month in [11, 12, 1, 2, 3, 4]: # Snowfall months
        monthly_snow = 57 if month == 1 else 223 / 6 # Distribute annual 223mm over 6 months, with January peak
        daily_avg = monthly_snow / (days_in_month / 4) # Assume snow on 25% of days
        snow_days = np.random.choice([0, 1], size=days_in_month, p=[0.75, 0.25]) # 25% chance of snow
        weather_df.loc[weather_df['Month'] == month, 'Total_Snowfall_cm'] = snow_days * np.random.exponential(scale=daily_avg, size=days_in_month)
    else:
        weather_df.loc[weather_df['Month'] == month, 'Total_Snowfall_cm'] = 0
# Drop the Month column
weather_df = weather_df.drop(columns=['Month'])
print("Sample of simulated weather data (2018 to March 2025):")
print(weather_df.head())
Sample of simulated weather data (2018 to March 2025):
Date Mean_Temperature_C Total_Rainfall_mm Total_Snowfall_cm
0 2018-01-01 0.721472 0.000000 0.000000
1 2018-01-02 -0.746016 0.591381 0.000000
2 2018-01-03 -2.221386 0.022729 0.000000
3 2018-01-04 -3.210693 0.108828 1.077358
4 2018-01-05 -2.542752 0.000000 0.000000
In [19]:
# Extract the date (without time) for daily aggregation
combined_df['Date'] = combined_df['Creation Date'].dt.date
# Aggregate service requests by ward and date -> one row per (ward, day)
daily_requests = combined_df.groupby(['Ward', 'Date']).size().reset_index(name='Request_Count')
# Convert 'Date' back to datetime for merging (.dt.date yields python date objects)
daily_requests['Date'] = pd.to_datetime(daily_requests['Date'])
# Merge with the original combined_df to get the 'Season' column.
# drop_duplicates collapses to one (Ward, Date, Season) row each, which keeps
# the merge one-to-one because Season is derived from the date alone.
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
daily_requests = daily_requests.merge(
    combined_df[['Ward', 'Date', 'Season']].drop_duplicates(),
    on=['Ward', 'Date'],
    how='left'
)
print("Sample of daily service request counts by ward:")
print(daily_requests.head())
Sample of daily service request counts by ward:
Ward Date Request_Count Season
0 Beaches-East York (19) 2018-01-01 30 Winter
1 Beaches-East York (19) 2018-01-02 93 Winter
2 Beaches-East York (19) 2018-01-03 98 Winter
3 Beaches-East York (19) 2018-01-04 84 Winter
4 Beaches-East York (19) 2018-01-05 92 Winter
In [20]:
# Attach the simulated daily weather to every (ward, date) request count.
# A left join preserves each request row even for dates with no weather data
# (those rows simply carry NaN weather columns).
daily_requests_weather = pd.merge(
    daily_requests,
    weather_df,
    how='left',
    on='Date',
)
print("Sample of merged data:")
print(daily_requests_weather.head())
Sample of merged data:
Ward Date Request_Count Season \
0 Beaches-East York (19) 2018-01-01 30 Winter
1 Beaches-East York (19) 2018-01-02 93 Winter
2 Beaches-East York (19) 2018-01-03 98 Winter
3 Beaches-East York (19) 2018-01-04 84 Winter
4 Beaches-East York (19) 2018-01-05 92 Winter
Mean_Temperature_C Total_Rainfall_mm Total_Snowfall_cm
0 0.721472 0.000000 0.000000
1 -0.746016 0.591381 0.000000
2 -2.221386 0.022729 0.000000
3 -3.210693 0.108828 1.077358
4 -2.542752 0.000000 0.000000
In [21]:
import scipy.stats as stats
import numpy as np

def _safe_pearsonr(x, y):
    """Pearson r of two Series, or NaN when the correlation is undefined.

    scipy.stats.pearsonr emits ConstantInputWarning (and returns NaN) when
    either input is constant — e.g. summer snowfall is all zeros. Checking
    for a constant input first yields the same NaN result without flooding
    the cell output with warnings.
    """
    if x.nunique() < 2 or y.nunique() < 2:
        return np.nan
    r, _ = stats.pearsonr(x, y)
    return r

# Initialize a dictionary to store correlations; one row per (ward, season).
correlations = {
    'Ward': [],
    'Season': [],
    'Temperature_Correlation': [],
    'Rainfall_Correlation': [],
    'Snowfall_Correlation': []
}
# List of seasons and wards
seasons = ['Winter', 'Spring', 'Summer', 'Fall']
wards = daily_requests_weather['Ward'].unique()
# Compute correlations for each ward and season
for ward in wards:
    for season in seasons:
        # Filter data for the current ward and season
        subset = daily_requests_weather[(daily_requests_weather['Ward'] == ward) & (daily_requests_weather['Season'] == season)]
        if len(subset) < 2: # Need at least 2 data points to compute correlation
            corr_temp = corr_rain = corr_snow = np.nan
        else:
            # Pearson correlations of daily request volume vs each weather attribute
            corr_temp = _safe_pearsonr(subset['Request_Count'], subset['Mean_Temperature_C'])
            corr_rain = _safe_pearsonr(subset['Request_Count'], subset['Total_Rainfall_mm'])
            corr_snow = _safe_pearsonr(subset['Request_Count'], subset['Total_Snowfall_cm'])
        # Store the results
        correlations['Ward'].append(ward)
        correlations['Season'].append(season)
        correlations['Temperature_Correlation'].append(corr_temp)
        correlations['Rainfall_Correlation'].append(corr_rain)
        correlations['Snowfall_Correlation'].append(corr_snow)
# Create a DataFrame from the correlations
corr_df = pd.DataFrame(correlations)
print("Sample of correlation results:")
print(corr_df.head())
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/1751068593.py:29: ConstantInputWarning: An input array is constant; the correlation coefficient is not defined. corr_snow, _ = stats.pearsonr(subset['Request_Count'], subset['Total_Snowfall_cm'])
Sample of correlation results:
Ward Season Temperature_Correlation \
0 Beaches-East York (19) Winter -0.041033
1 Beaches-East York (19) Spring 0.030053
2 Beaches-East York (19) Summer 0.060603
3 Beaches-East York (19) Fall -0.040910
4 Davenport (09) Winter -0.021434
Rainfall_Correlation Snowfall_Correlation
0 0.029495 -0.012289
1 -0.049212 -0.052994
2 -0.031875 NaN
3 0.041697 -0.004217
4 -0.020808 0.005881
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/1751068593.py:27: ConstantInputWarning: An input array is constant; the correlation coefficient is not defined. corr_temp, _ = stats.pearsonr(subset['Request_Count'], subset['Mean_Temperature_C']) /var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/1751068593.py:28: ConstantInputWarning: An input array is constant; the correlation coefficient is not defined. corr_rain, _ = stats.pearsonr(subset['Request_Count'], subset['Total_Rainfall_mm'])
In [22]:
import seaborn as sns
import matplotlib.pyplot as plt
# One heatmap per season: wards down the y-axis, the three weather
# correlation columns across the x-axis, diverging colours centred at 0.
for season in seasons:
    per_season_corr = (
        corr_df[corr_df['Season'] == season]
        .set_index('Ward')
        .drop(columns=['Season'])  # only the numeric correlation columns remain
    )
    plt.figure(figsize=(10, 8))
    sns.heatmap(per_season_corr, annot=True, cmap='coolwarm', center=0, vmin=-1, vmax=1)
    plt.title(f'Pearson Correlations: Service Requests vs Weather Attributes ({season})')
    plt.xlabel('Weather Attribute')
    plt.ylabel('Ward')
    plt.tight_layout()
    plt.show()
In [23]:
import pandas as pd
import numpy as np
# Ensure 'Creation Date' is in datetime format and filter the date range
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'])
start_date = pd.to_datetime('2018-01-01')
end_date = pd.to_datetime('2025-03-31')
combined_df = combined_df[(combined_df['Creation Date'] >= start_date) & (combined_df['Creation Date'] <= end_date)]
# Extract the date (without time) for merging with weather data
combined_df['Date'] = combined_df['Creation Date'].dt.date
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
# Simulate weather data (replace with actual data from Environment Canada).
# NOTE(review): this block duplicates the In[18] simulation cell verbatim
# (same seed, same draw order, so same values) — consider factoring it into a
# make_weather_df() helper so the two copies cannot drift apart.
dates = pd.date_range(start='2018-01-01', end='2025-03-26', freq='D')
np.random.seed(42)
weather_data = {
    'Date': dates,
    'Mean_Temperature_C': np.random.normal(loc=10, scale=10, size=len(dates)),
    'Total_Rainfall_mm': np.random.exponential(scale=2, size=len(dates)),
    'Total_Snowfall_cm': np.zeros(len(dates))
}
weather_df = pd.DataFrame(weather_data)
weather_df['Month'] = weather_df['Date'].dt.month
# Overwrite the placeholder temperatures with seasonally-shaped draws
for month in range(1, 13):
    if month in [1, 2, 12]:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=-2.3, scale=3, size=len(weather_df[weather_df['Month'] == month]))
    elif month in [6, 7, 8]:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=24.8, scale=3, size=len(weather_df[weather_df['Month'] == month]))
    elif month in [3, 4, 5]:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=12, scale=5, size=len(weather_df[weather_df['Month'] == month]))
    else:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=15, scale=5, size=len(weather_df[weather_df['Month'] == month]))
# Monthly rainfall totals (mm) distributed over ~half the days of each month
monthly_rainfall = {1: 54, 2: 19, 3: 80, 4: 80, 5: 48, 6: 81, 7: 53, 8: 75, 9: 78, 10: 78, 11: 75, 12: 54}
for month in range(1, 13):
    days_in_month = weather_df[weather_df['Month'] == month].shape[0]
    monthly_total = monthly_rainfall[month]
    daily_avg = monthly_total / (days_in_month / 2)
    rain_days = np.random.choice([0, 1], size=days_in_month, p=[0.5, 0.5])
    weather_df.loc[weather_df['Month'] == month, 'Total_Rainfall_mm'] = rain_days * np.random.exponential(scale=daily_avg, size=days_in_month)
# Snowfall only from November through April, heavier in January
for month in range(1, 13):
    days_in_month = weather_df[weather_df['Month'] == month].shape[0]
    if month in [11, 12, 1, 2, 3, 4]:
        monthly_snow = 57 if month == 1 else 223 / 6
        daily_avg = monthly_snow / (days_in_month / 4)
        snow_days = np.random.choice([0, 1], size=days_in_month, p=[0.75, 0.25])
        weather_df.loc[weather_df['Month'] == month, 'Total_Snowfall_cm'] = snow_days * np.random.exponential(scale=daily_avg, size=days_in_month)
    else:
        weather_df.loc[weather_df['Month'] == month, 'Total_Snowfall_cm'] = 0
weather_df = weather_df.drop(columns=['Month'])
# Merge combined_df with weather data (left join keeps all request rows)
combined_df_weather = combined_df.merge(weather_df, on='Date', how='left')
print("Sample of merged data:")
print(combined_df_weather[['Creation Date', 'Date', 'Service Request Type', 'Season', 'Mean_Temperature_C', 'Total_Rainfall_mm', 'Total_Snowfall_cm']].head())
Sample of merged data:
Creation Date Date Service Request Type Season \
0 2018-01-01 00:06:06 2018-01-01 Noise Winter
1 2018-01-01 00:14:04 2018-01-01 Noise Winter
2 2018-01-01 00:17:19 2018-01-01 INJUR/DIST WILDLIFE Winter
3 2018-01-01 00:24:35 2018-01-01 Watermain-Possible Break Winter
4 2018-01-01 00:28:44 2018-01-01 Water Service Line-No Water Winter
Mean_Temperature_C Total_Rainfall_mm Total_Snowfall_cm
0 0.721472 0.0 0.0
1 0.721472 0.0 0.0
2 0.721472 0.0 0.0
3 0.721472 0.0 0.0
4 0.721472 0.0 0.0
In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Step 1: Filter and prepare the data
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'])
start_date = pd.to_datetime('2018-01-01')
end_date = pd.to_datetime('2025-03-31')
# BUG FIX: `<= end_date` compared timestamps against midnight, dropping all
# requests created later on 2025-03-31; use an exclusive next-midnight bound.
combined_df = combined_df[
    (combined_df['Creation Date'] >= start_date)
    & (combined_df['Creation Date'] < end_date + pd.Timedelta(days=1))
]
print("Unique values in the 'Season' column:")
print(combined_df['Season'].unique())
# Step 2: Identify top 5 requests per season
seasonal_requests = combined_df.groupby(['Season', 'Service Request Type']).size().reset_index(name='Count')
top_requests_by_season = {}
seasons = ['Winter', 'Spring', 'Summer', 'Fall']
for season in seasons:
    season_data = seasonal_requests[seasonal_requests['Season'] == season]
    top_5 = season_data.nlargest(5, 'Count')
    top_requests_by_season[season] = top_5
for season in seasons:
    print(f"\nTop 5 Service Requests in {season}:")
    print(top_requests_by_season[season][['Service Request Type', 'Count']])
# Step 3: Plot the top 5 requests for each season
sns.set(style="whitegrid")
for season in seasons:
    data = top_requests_by_season[season]
    plt.figure(figsize=(10, 6))
    # FIX: assigning `hue` (with legend=False) removes the seaborn
    # FutureWarning about passing `palette` without `hue`, which is
    # deprecated and slated for removal in seaborn 0.14. Rendering is
    # unchanged: one viridis colour per bar.
    sns.barplot(x='Count', y='Service Request Type', hue='Service Request Type',
                data=data, palette='viridis', legend=False)
    plt.title(f'Top 5 Service Requests in {season} (2018–2025)', fontsize=14)
    plt.xlabel('Number of Requests', fontsize=12)
    plt.ylabel('Service Request Type', fontsize=12)
    plt.tight_layout()
    plt.show()
Unique values in the 'Season' column:
['Winter' 'Spring' 'Summer' 'Fall']
Top 5 Service Requests in Winter:
Service Request Type Count
2450 Road - Pot hole 23941
2409 Residential: Bin: Repair or Replace Lid 21916
2329 Publication Request - Solid Waste 20652
2324 Property Standards 18785
2351 Res / Garbage / Not Picked Up 15227
Top 5 Service Requests in Spring:
Service Request Type Count
1066 Road - Pot hole 35207
1029 Residential: Bin: Repair or Replace Lid 28035
958 Property Standards 21335
1026 Residential Furniture / Not Picked Up 13681
1028 Residential: Bin: Repair or Replace Body/Handle 13573
Top 5 Service Requests in Summer:
Service Request Type Count
1649 Residential: Bin: Repair or Replace Lid 33857
1585 Property Standards 29675
1427 General Pruning 27111
1497 Long Grass and Weeds 22751
1296 CADAVER WILDLIFE 18964
Top 5 Service Requests in Fall:
Service Request Type Count
384 Residential: Bin: Repair or Replace Lid 28943
321 Property Standards 24650
38 CADAVER WILDLIFE 16031
382 Residential Furniture / Not Picked Up 14644
41 Cadaver - Wildlife 14287
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/540139518.py:30: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='Count', y='Service Request Type', data=data, palette='viridis')
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/540139518.py:30: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='Count', y='Service Request Type', data=data, palette='viridis')
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/540139518.py:30: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='Count', y='Service Request Type', data=data, palette='viridis')
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/540139518.py:30: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='Count', y='Service Request Type', data=data, palette='viridis')
In [25]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Prepare the data — restrict requests to the study window (Jan 2018 – Mar 2025).
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'])
start_date = pd.to_datetime('2018-01-01')
end_date = pd.to_datetime('2025-03-31')
# .copy() so the column assignments below don't trigger SettingWithCopyWarning.
combined_df = combined_df[(combined_df['Creation Date'] >= start_date) & (combined_df['Creation Date'] <= end_date)].copy()
# Midnight-normalized timestamp so each request date can be matched against the holiday table.
combined_df['Date'] = combined_df['Creation Date'].dt.normalize()

# Step 2: Identify weekdays, weekends, and holidays.
# Observed Ontario holiday dates 2018 – early 2025 (dates shifted where the holiday
# fell on a weekend). NOTE(review): Remembrance Day is listed but is not an Ontario
# statutory holiday — confirm it should count as a 'Holiday' for this analysis.
holidays = {
    '2018-01-01': 'New Year’s Day', '2018-02-19': 'Family Day', '2018-03-30': 'Good Friday',
    '2018-05-21': 'Victoria Day', '2018-07-02': 'Canada Day', '2018-09-03': 'Labour Day',
    '2018-10-08': 'Thanksgiving', '2018-11-12': 'Remembrance Day', '2018-12-25': 'Christmas Day',
    '2018-12-26': 'Boxing Day',
    '2019-01-01': 'New Year’s Day', '2019-02-18': 'Family Day', '2019-04-19': 'Good Friday',
    '2019-05-20': 'Victoria Day', '2019-07-01': 'Canada Day', '2019-09-02': 'Labour Day',
    '2019-10-14': 'Thanksgiving', '2019-11-11': 'Remembrance Day', '2019-12-25': 'Christmas Day',
    '2019-12-26': 'Boxing Day',
    '2020-01-01': 'New Year’s Day', '2020-02-17': 'Family Day', '2020-04-10': 'Good Friday',
    '2020-05-18': 'Victoria Day', '2020-07-01': 'Canada Day', '2020-09-07': 'Labour Day',
    '2020-10-12': 'Thanksgiving', '2020-11-11': 'Remembrance Day', '2020-12-25': 'Christmas Day',
    '2020-12-28': 'Boxing Day',
    '2021-01-01': 'New Year’s Day', '2021-02-15': 'Family Day', '2021-04-02': 'Good Friday',
    '2021-05-24': 'Victoria Day', '2021-07-01': 'Canada Day', '2021-09-06': 'Labour Day',
    '2021-10-11': 'Thanksgiving', '2021-11-11': 'Remembrance Day', '2021-12-27': 'Christmas Day',
    '2021-12-28': 'Boxing Day',
    '2022-01-03': 'New Year’s Day', '2022-02-21': 'Family Day', '2022-04-15': 'Good Friday',
    '2022-05-23': 'Victoria Day', '2022-07-01': 'Canada Day', '2022-09-05': 'Labour Day',
    '2022-10-10': 'Thanksgiving', '2022-11-11': 'Remembrance Day', '2022-12-26': 'Christmas Day',
    '2022-12-27': 'Boxing Day',
    '2023-01-02': 'New Year’s Day', '2023-02-20': 'Family Day', '2023-04-07': 'Good Friday',
    '2023-05-22': 'Victoria Day', '2023-07-03': 'Canada Day', '2023-09-04': 'Labour Day',
    '2023-10-09': 'Thanksgiving', '2023-11-13': 'Remembrance Day', '2023-12-25': 'Christmas Day',
    '2023-12-26': 'Boxing Day',
    '2024-01-01': 'New Year’s Day', '2024-02-19': 'Family Day', '2024-03-29': 'Good Friday',
    '2024-05-20': 'Victoria Day', '2024-07-01': 'Canada Day', '2024-09-02': 'Labour Day',
    '2024-10-14': 'Thanksgiving', '2024-11-11': 'Remembrance Day', '2024-12-25': 'Christmas Day',
    '2024-12-26': 'Boxing Day',
    '2025-01-01': 'New Year’s Day', '2025-02-17': 'Family Day', '2025-04-18': 'Good Friday',
}
# Re-key by Timestamp so lookups against the normalized 'Date' column work.
holidays = {pd.to_datetime(date): name for date, name in holidays.items()}

# Vectorized day-type classification (replaces a row-wise DataFrame.apply, which made
# one Python call per row). Precedence: Holiday > Weekend > Weekday.
day_type = pd.Series('Weekday', index=combined_df.index)
day_type[combined_df['Date'].dt.weekday >= 5] = 'Weekend'
day_type[combined_df['Date'].isin(list(holidays))] = 'Holiday'
combined_df['Day_Type'] = day_type
print("Sample of data with Day_Type:")
print(combined_df[['Date', 'Day_Type']].head())

# Step 3: Compute average daily calls per day type.
daily_calls = combined_df.groupby(['Date', 'Day_Type']).size().reset_index(name='Call_Count')
avg_daily_calls = daily_calls.groupby('Day_Type')['Call_Count'].mean().reset_index()
avg_daily_calls['Day_Type'] = pd.Categorical(avg_daily_calls['Day_Type'], categories=['Weekday', 'Weekend', 'Holiday'], ordered=True)
avg_daily_calls = avg_daily_calls.sort_values('Day_Type')
print("\nAverage daily calls by Day_Type:")
print(avg_daily_calls)

# Step 4: Create the bar graph. Assigning `hue` with `legend=False` replaces the
# deprecated `palette`-without-`hue` pattern (the FutureWarning seen previously;
# removal scheduled for seaborn v0.14).
sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))
sns.barplot(x='Day_Type', y='Call_Count', data=avg_daily_calls, hue='Day_Type', palette='muted', legend=False)
plt.title('Average Daily 311 Service Requests by Day Type (2018–2025)', fontsize=14)
plt.xlabel('Day Type', fontsize=12)
plt.ylabel('Average Daily Calls', fontsize=12)
plt.tight_layout()
plt.show()
Sample of data with Day_Type:
Date Day_Type
0 2018-01-01 Holiday
1 2018-01-01 Holiday
2 2018-01-01 Holiday
3 2018-01-01 Holiday
4 2018-01-01 Holiday
Average daily calls by Day_Type:
Day_Type Call_Count
1 Weekday 1328.509499
2 Weekend 587.183333
0 Holiday 597.985294
/var/folders/07/3vs0pyfj68xd69bt_2bj8ln40000gn/T/ipykernel_3516/3918527136.py:68: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x='Day_Type', y='Call_Count', data=avg_daily_calls, palette='muted')
In [26]:
from sklearn.metrics import mean_absolute_error

# Additional error metrics for the volume model, reported on the original
# (back-transformed) count scale: MAE = typical absolute miss, RMSE weights
# large misses more heavily.
mae_rf = mean_absolute_error(y_test_vol_exp, y_pred_rf_exp)
rmse_rf = mean_squared_error(y_test_vol_exp, y_pred_rf_exp) ** 0.5
for label, value in (("Mean Absolute Error (MAE)", mae_rf),
                     ("Root Mean Squared Error (RMSE)", rmse_rf)):
    print(f"{label}: {value:.4f}")
Mean Absolute Error (MAE): 174.1699 Root Mean Squared Error (RMSE): 231.2838
In [27]:
from sklearn.metrics import precision_score, recall_score

# Classification quality of the status model beyond accuracy:
# precision = share of predicted-positive cases that are truly positive,
# recall = share of truly positive cases the model recovered.
metrics = {
    'Precision': precision_score(y_test_stat, y_pred_stat),
    'Recall': recall_score(y_test_stat, y_pred_stat),
}
precision_rf = metrics['Precision']
recall_rf = metrics['Recall']
for name, score in metrics.items():
    print(f"{name}: {score:.4f}")
Precision: 0.6008 Recall: 0.7215
In [28]:
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Prepare the data — keep only requests created inside the analysis window.
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'])
start_date = pd.to_datetime('2018-01-01')
end_date = pd.to_datetime('2025-03-31')
in_window = combined_df['Creation Date'].between(start_date, end_date)
combined_df = combined_df[in_window]

# Fail fast with a helpful message if the grouping column is missing.
if 'Division' not in combined_df.columns:
    raise KeyError("The 'Division' column is not found in combined_df. Please check the column name (e.g., 'Section - Unit', 'Department') and adjust the code.")
print("Unique values in the 'Division' column:")
print(combined_df['Division'].unique())

# Step 2: Compute each division's request count and share of the total.
division_counts = combined_df['Division'].value_counts()
total_requests = division_counts.sum()
division_proportions = division_counts / total_requests
division_df = division_counts.rename_axis('Division').reset_index(name='Count')
division_df['Proportion'] = division_proportions.values
division_df = division_df.sort_values(by='Count', ascending=False)
print("\nRequest counts and proportions by division:")
print(division_df)

# Step 3: Create the pie chart; slice labels go in a side legend since several
# divisions are too small to label inline.
labels = division_df['Division']
sizes = division_df['Proportion']
slice_colors = plt.cm.Paired(range(len(labels)))
offsets = [0.05] * len(labels)
fig, ax = plt.subplots(figsize=(8, 12))
ax.pie(sizes, explode=offsets, labels=None, colors=slice_colors, autopct='%1.1f%%', startangle=140)
ax.set_title('Proportion of 311 Service Requests by Division (2018–2025)', fontsize=14)
ax.legend(labels, title="Division", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
ax.axis('equal')  # keep the pie circular
plt.show()
Unique values in the 'Division' column:
['Municipal Licensing & Standards' 'Toronto Water'
'Transportation Services' '311' 'Solid Waste Management Services'
'Urban Forestry' 'City of Toronto' 'Unknown' 'Parks']
Request counts and proportions by division:
Division Count Proportion
0 Solid Waste Management Services 1028897 0.371301
1 Municipal Licensing & Standards 645772 0.233042
2 Transportation Services 634676 0.229038
3 Toronto Water 249630 0.090085
4 Urban Forestry 122159 0.044084
5 311 53611 0.019347
6 Parks 20532 0.007409
7 Unknown 15327 0.005531
8 City of Toronto 452 0.000163
In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score,
                             accuracy_score, f1_score, precision_score, recall_score)

# Set random seed for reproducibility — this single global seed drives every random
# draw below (simulated weather) in sequence, so do not reorder the random calls.
np.random.seed(42)

# Load cleaned data and restrict to the study window (Jan 2018 – Mar 2025).
combined_df = pd.read_csv("data/SR_all_years_cleaned.csv")
combined_df['Creation Date'] = pd.to_datetime(combined_df['Creation Date'])
combined_df = combined_df[(combined_df['Creation Date'] >= '2018-01-01') &
                          (combined_df['Creation Date'] <= '2025-03-31')]

# Simulate weather data (from your earlier script).
# NOTE(review): this weather is SYNTHETIC — random draws shaped to rough monthly
# climate figures, not observed station data. Model results that use these features
# reflect the simulation, not real weather; confirm this is intended for the report.
dates = pd.date_range(start='2018-01-01', end='2025-03-31', freq='D')  # Extended to match data
weather_data = {
    'Date': dates,
    # The two draws below are placeholders; both columns are overwritten per month
    # in the loops that follow.
    'Mean_Temperature_C': np.random.normal(loc=10, scale=10, size=len(dates)),
    'Total_Rainfall_mm': np.random.exponential(scale=2, size=len(dates)),
    'Total_Snowfall_cm': np.zeros(len(dates))
}
weather_df = pd.DataFrame(weather_data)
weather_df['Month'] = weather_df['Date'].dt.month
# Approximate monthly rainfall totals (mm) used to scale the simulated daily amounts.
monthly_rainfall = {1: 54, 2: 19, 3: 80, 4: 80, 5: 48, 6: 81, 7: 53, 8: 75, 9: 78, 10: 78, 11: 75, 12: 54}
for month in range(1, 13):
    # Re-draw temperatures with a season-appropriate mean:
    # winter (Dec–Feb), summer (Jun–Aug), spring (Mar–May), else fall.
    if month in [1, 2, 12]:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=-2.3, scale=3, size=len(weather_df[weather_df['Month'] == month]))
    elif month in [6, 7, 8]:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=24.8, scale=3, size=len(weather_df[weather_df['Month'] == month]))
    elif month in [3, 4, 5]:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=12, scale=5, size=len(weather_df[weather_df['Month'] == month]))
    else:
        weather_df.loc[weather_df['Month'] == month, 'Mean_Temperature_C'] = np.random.normal(loc=15, scale=5, size=len(weather_df[weather_df['Month'] == month]))
    # Rainfall: roughly half the days rain, with exponential daily amounts.
    # NOTE(review): days_in_month counts this calendar month across ALL years in the
    # range (~7x one month's days), so daily_avg is scaled down accordingly —
    # TODO confirm this scaling is intended.
    days_in_month = weather_df[weather_df['Month'] == month].shape[0]
    monthly_total = monthly_rainfall[month]
    daily_avg = monthly_total / (days_in_month / 2)
    rain_days = np.random.choice([0, 1], size=days_in_month, p=[0.5, 0.5])
    weather_df.loc[weather_df['Month'] == month, 'Total_Rainfall_mm'] = rain_days * np.random.exponential(scale=daily_avg, size=days_in_month)
for month in range(1, 13):
    days_in_month = weather_df[weather_df['Month'] == month].shape[0]
    if month in [11, 12, 1, 2, 3, 4]:
        # Snow months: 57 cm budget for January, 223/6 (~37 cm) for each other snow
        # month, with snow falling on ~25% of days — TODO confirm intended totals.
        monthly_snow = 57 if month == 1 else 223 / 6
        daily_avg = monthly_snow / (days_in_month / 4)
        snow_days = np.random.choice([0, 1], size=days_in_month, p=[0.75, 0.25])
        weather_df.loc[weather_df['Month'] == month, 'Total_Snowfall_cm'] = snow_days * np.random.exponential(scale=daily_avg, size=days_in_month)
    else:
        weather_df.loc[weather_df['Month'] == month, 'Total_Snowfall_cm'] = 0
weather_df = weather_df.drop(columns=['Month'])

# Merge simulated daily weather onto each request by calendar date.
combined_df['Date'] = combined_df['Creation Date'].dt.date
combined_df['Date'] = pd.to_datetime(combined_df['Date'])
combined_df = combined_df.merge(weather_df, on='Date', how='left')

# Hold out 2025 entirely. NOTE(review): despite earlier comments about a
# chronological 2018-2023 train / 2024 test split, the splits below are RANDOM
# 70/30 splits of all 2018-2024 rows.
train_df = combined_df[combined_df['Year'] != 2025].copy()

# Volume Prediction: aggregate to request counts per (Year, Month, Season, Ward).
volume_df = train_df.groupby(['Year', 'Month', 'Season', 'Ward']).size().reset_index(name='Request_Count')
# Label-encode categoricals. NOTE(review): encoders are fit independently here and
# again on train_df further down, so the integer codes are not comparable between
# the two models.
for col in ['Season', 'Ward']:
    volume_df[col] = LabelEncoder().fit_transform(volume_df[col])
# Create a temporary date column (first of the month) for merging weather data.
volume_df['Date'] = pd.to_datetime(volume_df[['Year', 'Month']].assign(day=1))
weather_subset = train_df[['Date', 'Mean_Temperature_C', 'Total_Rainfall_mm', 'Total_Snowfall_cm']].drop_duplicates()
# Merge weather data; the Date column is simply excluded from the feature matrix
# below. NOTE(review): since volume_df['Date'] is the 1st of the month, each monthly
# row receives the simulated weather of that single day — confirm a monthly
# aggregate wasn't intended instead.
volume_df = volume_df.merge(weather_subset, on='Date', how='left')
X_vol = volume_df[['Year', 'Month', 'Season', 'Ward', 'Mean_Temperature_C', 'Total_Rainfall_mm', 'Total_Snowfall_cm']]
# Log-transform the counts to tame the heavy right tail; predictions are inverted
# with expm1 before evaluation.
y_vol = np.log1p(volume_df['Request_Count'])

# Split data (random 70/30).
X_train_vol, X_test_vol, y_train_vol, y_test_vol = train_test_split(X_vol, y_vol, test_size=0.3, random_state=42)

# Volume Prediction: Random Forest Regressor fit on log counts.
rf_reg = RandomForestRegressor(n_estimators=200, max_depth=15, min_samples_split=2, random_state=42, n_jobs=-1)
rf_reg.fit(X_train_vol, y_train_vol)
y_pred_rf = np.expm1(rf_reg.predict(X_test_vol))  # back to the original count scale
y_test_vol_exp = np.expm1(y_test_vol)

# Volume Evaluation (all metrics on the original count scale).
mae_rf = mean_absolute_error(y_test_vol_exp, y_pred_rf)
rmse_rf = np.sqrt(mean_squared_error(y_test_vol_exp, y_pred_rf))
r2_rf = r2_score(y_test_vol_exp, y_pred_rf)
print("\nVolume Prediction (Random Forest):")
print(f"R² Score: {r2_rf:.4f}")
print(f"Mean Absolute Error (MAE): {mae_rf:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf:.4f}")

# Plot Actual vs Predicted Volumes with a y = x reference line (perfect prediction).
plt.figure(figsize=(10, 6))
plt.scatter(y_test_vol_exp, y_pred_rf, alpha=0.5, color='purple')
plt.plot([y_test_vol_exp.min(), y_test_vol_exp.max()], [y_test_vol_exp.min(), y_test_vol_exp.max()], 'r--')
plt.title('Actual vs Predicted 311 Request Volumes, (Random Forest)', fontsize=14)
plt.xlabel('Actual Request Count', fontsize=12)
plt.ylabel('Predicted Request Count', fontsize=12)
plt.tight_layout()
plt.savefig('figure1_volume.png')  # Save for report
plt.show()

# Status Prediction: Prepare Data. NOTE: this label-encodes train_df IN PLACE, so
# the original string categories are no longer available after this point.
for col in ['Season', 'Service Request Type', 'Division', 'Ward']:
    train_df[col] = LabelEncoder().fit_transform(train_df[col])
# Binary target: 1 = request Closed, 0 = any other status.
train_df['Status'] = train_df['Status'].apply(lambda x: 1 if x == 'Closed' else 0)
X_stat = train_df[['Season', 'Service Request Type', 'Division', 'Ward', 'Mean_Temperature_C', 'Total_Rainfall_mm', 'Total_Snowfall_cm']]
y_stat = train_df['Status']
X_train_stat, X_test_stat, y_train_stat, y_test_stat = train_test_split(X_stat, y_stat, test_size=0.3, random_state=42)

# Status Prediction: Random Forest Classifier; class_weight='balanced' offsets the
# Closed / not-Closed class imbalance.
rf_clf = RandomForestClassifier(n_estimators=200, max_depth=15, min_samples_split=2, class_weight='balanced', random_state=42, n_jobs=-1)
rf_clf.fit(X_train_stat, y_train_stat)
y_pred_stat = rf_clf.predict(X_test_stat)

# Status Evaluation.
accuracy_rf = accuracy_score(y_test_stat, y_pred_stat)
f1_rf = f1_score(y_test_stat, y_pred_stat)
precision_rf = precision_score(y_test_stat, y_pred_stat)
recall_rf = recall_score(y_test_stat, y_pred_stat)
print("\nStatus Prediction (Random Forest):")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"F1-Score: {f1_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")

# Feature Importance for Status Prediction (impurity-based importances from the forest).
feat_importance = pd.Series(rf_clf.feature_importances_, index=X_stat.columns).sort_values(ascending=False)
print("\nFeature Importance (Status Prediction):")
print(feat_importance)

# Save updated data (encoded features + simulated weather columns) for reference.
train_df.to_csv("data/SR_train_with_weather.csv", index=False)
print("\nUpdated training data saved as 'data/SR_train_with_weather.csv'")
Volume Prediction (Random Forest): R² Score: 0.8253 Mean Absolute Error (MAE): 145.0985 Root Mean Squared Error (RMSE): 192.9323
Status Prediction (Random Forest): Accuracy: 0.7306 F1-Score: 0.7263 Precision: 0.6893 Recall: 0.7674 Feature Importance (Status Prediction): Service Request Type 0.404704 Mean_Temperature_C 0.184243 Division 0.148536 Total_Rainfall_mm 0.116991 Total_Snowfall_cm 0.078485 Ward 0.038405 Season 0.028635 dtype: float64 Updated training data saved as 'data/SR_train_with_weather.csv'
In [ ]: